import os
import pandas as pd

data_path = '../input/'
# Work from inside the input directory so the bare CSV names below resolve there.
os.chdir(data_path)
print(os.getcwd())

# 1. Read the data
df_protein_train, df_protein_test = (
    pd.read_csv(csv_name)
    for csv_name in ('df_protein_train.csv', 'df_protein_test.csv')
)

# Stack the train rows on top of the test rows; each frame keeps its own row labels.
protein_concat = pd.concat((df_protein_train, df_protein_test))
print(protein_concat.head())


# print(protein_concat['Sequence'])

# Residue order shared by the rows and the columns of _II_WEIGHTS.
_II_RESIDUES = "WCMHYFQNIRDPTKEVSGAL"
_II_INDEX = {residue: position for position, residue in enumerate(_II_RESIDUES)}

# Dipeptide instability weights: _II_WEIGHTS[i][j] is the weight of residue i
# immediately followed by residue j, both indexed in _II_RESIDUES order.
# Every row must hold exactly 20 values; a stray extra value shifts the
# remaining columns and silently assigns weights to the wrong dipeptides.
#     W      C      M      H      Y      F      Q      N      I      R      D      P      T      K      E      V      S      G      A      L
_II_WEIGHTS = (
    (1.0, 1.0, 24.68, 24.68, 1.0, 1.0, 1.0, 13.34, 1.0, 1.0, 1.0, 1.0, -14.03, 1.0, 1.0, -7.49, 1.0, -9.37, -14.03, 13.34),  # W
    (24.68, 1.0, 33.6, 33.6, 1.0, 1.0, -6.54, 1.0, 1.0, 1.0, 20.26, 20.26, 33.6, 1.0, 1.0, -6.54, 1.0, 1.0, 1.0, 20.26),  # C
    (1.0, 1.0, -1.88, 58.28, 24.68, 1.0, -6.54, 1.0, 1.0, -6.54, 1.0, 44.94, -1.88, 1.0, 1.0, 1.0, 44.94, 1.0, 13.34, 1.0),  # M
    (-1.88, 1.0, 1.0, 1.0, 44.94, -9.37, 1.0, 24.68, 44.94, 1.0, 1.0, -1.88, -6.54, 24.68, 1.0, 1.0, 1.0, -9.37, 1.0, 1.0),  # H
    (-9.37, 1.0, 44.94, 13.34, 13.34, 1.0, 1.0, 1.0, 1.0, -15.91, 24.68, 13.34, -7.49, 1.0, -6.54, 1.0, 1.0, -7.49, 24.68, 1.0),  # Y
    (1.0, 1.0, 1.0, 1.0, 33.6, 1.0, 1.0, 1.0, 1.0, 1.0, 13.34, 20.26, 1.0, -14.03, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0),  # F
    (1.0, -6.54, 1.0, 1.0, -6.54, -6.54, 20.26, 1.0, 1.0, 1.0, 20.26, 20.26, 1.0, 1.0, 20.26, -6.54, 44.94, 1.0, 1.0, 1.0),  # Q
    (-9.37, -1.88, 1.0, 1.0, 1.0, -14.03, -6.54, 1.0, 44.94, 1.0, 1.0, -1.88, -7.49, 24.68, 1.0, 1.0, 1.0, -14.03, 1.0, 1.0),  # N
    (1.0, 1.0, 1.0, 13.34, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.88, 1.0, -7.49, 44.94, -7.49, 1.0, 1.0, 1.0, 20.26),  # I
    (58.28, 1.0, 1.0, 20.26, -6.54, 1.0, 20.26, 13.34, 1.0, 58.28, 1.0, 20.26, 1.0, 1.0, 1.0, 1.0, 44.94, -7.49, 1.0, 1.0),  # R
    (1.0, 1.0, 1.0, 1.0, 1.0, -6.54, 1.0, 1.0, 1.0, -6.54, 1.0, 1.0, -14.03, -7.49, 1.0, 1.0, 20.26, 1.0, 1.0, 1.0),  # D
    (-1.88, -6.54, -6.54, 1.0, 1.0, 20.26, 20.26, 1.0, 1.0, -6.54, -6.54, 20.26, 1.0, 1.0, 18.38, 20.26, 20.26, 1.0, 20.26, 1.0),  # P
    (-14.03, 1.0, 1.0, 1.0, 1.0, 13.34, -6.54, -14.03, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 20.26, 1.0, 1.0, -7.49, 1.0, 1.0),  # T
    (1.0, 1.0, 33.6, 1.0, 1.0, 1.0, 24.68, 1.0, -7.49, 33.6, 1.0, -6.54, 1.0, 1.0, 1.0, -7.49, 1.0, -7.49, 1.0, -7.49),  # K
    (-14.03, 44.94, 1.0, -6.54, 1.0, 1.0, 20.26, 1.0, 20.26, 1.0, 20.26, 20.26, 1.0, 1.0, 33.6, 1.0, 20.26, 1.0, 1.0, 1.0),  # E
    (1.0, 1.0, 1.0, 1.0, -6.54, 1.0, 1.0, 1.0, 1.0, 1.0, -14.03, 20.26, -7.49, -1.88, 1.0, 1.0, 1.0, -7.49, 1.0, 1.0),  # V
    (1.0, 33.6, 1.0, 1.0, 1.0, 1.0, 20.26, 1.0, 1.0, 20.26, 1.0, 44.94, 1.0, 1.0, 20.26, 1.0, 20.26, 1.0, 1.0, 1.0),  # S
    (13.34, 1.0, 1.0, 1.0, -7.49, 1.0, 1.0, -7.49, -7.49, 1.0, 1.0, 1.0, -7.49, -7.49, -6.54, 1.0, 1.0, 13.34, -7.49, 1.0),  # G
    (1.0, 44.94, 1.0, -7.49, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -7.49, 20.26, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0),  # A
    (24.68, 1.0, 1.0, 1.0, 1.0, 1.0, 33.6, 1.0, 1.0, 20.26, 1.0, 20.26, 1.0, -7.49, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0),  # L
)


def getP(P):
    """Return the instability index (II) of the protein sequence ``P``.

    The score is ``(10 / len(P)) * sum(weight(P[k], P[k + 1]))`` taken over
    every pair of adjacent residues, with the weights read from the
    dipeptide table ``_II_WEIGHTS``, and is rounded to 8 decimal places.

    Args:
        P: Sequence of one-letter residue codes. Lookup is case-sensitive,
            so the codes must be upper case and drawn from the 20 letters
            ``WCMHYFQNIRDPTKEVSGAL``.

    Returns:
        The index as a float. A sequence with fewer than two residues has
        no dipeptides and scores ``0.0``; this includes the empty sequence,
        which would otherwise divide by zero.

    Raises:
        ValueError: If a residue that takes part in a dipeptide is not one
            of the 20 supported letters.
    """
    P_length = len(P)
    if P_length == 0:
        # Nothing to score, and the normalisation below would divide by zero.
        return 0.0

    II = 0
    # Accumulate left to right over adjacent residue pairs.
    for first, second in zip(P, P[1:]):
        if first not in _II_INDEX or second not in _II_INDEX:
            raise ValueError(
                "unsupported residue in dipeptide %r; expected only letters from %r"
                % (first + second, _II_RESIDUES)
            )
        II += _II_WEIGHTS[_II_INDEX[first]][_II_INDEX[second]]
    II = II * 10 / P_length

    return round(II, 8)


# Build the per-protein feature table: score every upper-cased sequence with
# getP and keep the result in the 'II' column.
ii_scores = [getP(sequence.upper()) for sequence in protein_concat['Sequence']]
charc = pd.DataFrame(ii_scores)
charc.columns = ['II']
charc['II'] = charc['II'].astype(float)

# Attach the IDs by position: .values drops protein_concat's row labels, so the
# fresh Series lines up with charc's default 0..n-1 index.
protein_ids = protein_concat['Protein_ID'].values
charc['Protein_ID'] = pd.Series(protein_ids)
print(charc.head())

# The output path is relative to the working directory set by os.chdir above.
charc.to_csv('../output/protein_get_II.csv', sep=',', header=True, index=False)
